# -*- coding: utf-8 -*-
#from selenium import webdriver 
import time
from lxml import etree
import itertools  
#import random
import requests 
import json
import codecs

# Python 2 encoding hack: force the process default encoding to UTF-8 so the
# Chinese literals below can be mixed with byte strings without UnicodeDecodeError.
import sys 
reload(sys) 
sys.setdefaultencoding( "utf-8" ) 
from fake_useragent import UserAgent
# One random desktop User-Agent chosen once per run (not per request).
ua = UserAgent()
# Output sink: one JSON object per line (JSON-lines), UTF-8 encoded.
# NOTE(review): `file` shadows the Python 2 builtin of the same name, and the
# path mixes escaped (\\l, \\l) and unescaped (\j, \F, \d) backslashes — it
# works only because those are not recognized escape sequences. A raw string
# r"E:\..." would be safer. Renaming is deferred: sibling functions reference
# this global by name.
file = codecs.open("E:\jiaocheng\F\Project\\lianjia\data\\lianjia_list.json",'wb',encoding = 'utf-8')
# Shared request headers for every page fetch; User-Agent is fixed at import time.
headers = {
		"User-Agent":ua.random,
		"Accept":"*/*",
		"Accept-Encoding":"gzip, deflate",
		"Accept-Language":"zh-CN,zh;q=0.8",
		"Connection":"keep-alive",
		}
#请求url，获取网页代码并返回
def request_html(url):
	try:
		html = requests.get(url,headers = headers)
		time.sleep(3)
		print "网页代码获取成功！"
		return html.text
	except:
		print "网页请求失败！！！"

#提取数据并保存
def get_info_and_save(html):
	print "正在提取数据......"
	tree = etree.HTML(html)
	info = tree.xpath('/html/body/div[4]/div[1]/ul/li')
	print len(info)
	for tree in info:
		#标题
		title = tree.xpath('div[1]/div[1]/a/text()')
		#楼盘名称
		loupan = tree.xpath('div[1]/div[2]/div/a/text()')
		#房间信息
		house_info = tree.xpath('div[1]/div[2]/div/text()')
		#地理位置
		position = tree.xpath('div[1]/div[3]/div/text()')
		#所在街道
		place = tree.xpath('div[1]/div[3]/div/a/text()')
		#关注人数、看房人数、发布日期
		follow_info = tree.xpath('div[1]/div[4]/text()')
		#特殊介绍
		special_info = tree.xpath('div[1]/div[5]/span/text()')
		#价格
		price = tree.xpath('div[1]/div[6]/div[1]/span/text()')
		#约合单价
		unit_price = tree.xpath('div[1]/div[6]/div[2]/span/text()')
		#for s in special_info:
		#	special_info = "--".join(s)
		s = "-*-".join(itertools.chain(special_info)) 
		#print title[0],loupan[0],house_info[0],position[0],place[0],follow_info[0],s,price[0],unit_price[0],"\n\n***************"
		print "*****准备保存******"
		#print len(title)
		#print len(loupan)
		#print len(house_info)
		#print len(position)
		#print len(place)
		#print len(follow_info)
		#print len(special_info)
		#for info in special_info:
		#	print info
		try:
			dic = {
					"标题":title[0],
					"楼盘名称":loupan[0],
					"房间信息":house_info[0],
					"地理位置":position[0],
					"所在街道":place[0],
					"关注人数、看房人数、发布日期":follow_info[0],
					"特殊介绍":s,
					"价格":price[0],
					"约合单价/平方米":unit_price[0],
				}
			dicts = json.dumps(dict(dic),ensure_ascii=False)
			line = dicts + '\n'
			#print "保存中......"
			file.write(line)
			print "数据写入成功！！！"
		except:
			print "数据写入失败！！！"

#主函数
def main():
	url = 'https://cs.lianjia.com/ershoufang/pg99/'
	for i in range(1,101):#总共101页
		url = 'https://cs.lianjia.com/ershoufang/pg' + str(i)
		print "正在爬取链接：%s"%url
		html = request_html(url)
		get_info_and_save(html)
		time.sleep(1)
	file.close()
if __name__ == '__main__':
	main()